
/**
 * Copyright (C) 2010-2018 Gordon Fraser, Andrea Arcuri and EvoSuite
 * contributors
 * <p>
 * This file is part of EvoSuite.
 * <p>
 * EvoSuite is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3.0 of the License, or
 * (at your option) any later version.
 * <p>
 * EvoSuite is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser Public License for more details.
 * <p>
 * You should have received a copy of the GNU Lesser General Public
 * License along with EvoSuite. If not, see <http://www.gnu.org/licenses/>.
 */
package org.evomaster.client.java.instrumentation.coverage.methodreplacement;

import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.State;
import dk.brics.automaton.Transition;
import org.evomaster.client.java.utils.SimpleLogger;
import org.jgrapht.DirectedGraph;
import org.jgrapht.alg.CycleDetector;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.traverse.TopologicalOrderIterator;

import java.util.*;
import java.util.regex.Pattern;

/**
 *  Class used to define the distance between a string and a regex
 */
public class RegexDistanceUtils {

    /*
     * Automatons for regex can be expensive to build. So we cache them,
     * as we might need to access to them several times during the search
     */
    private static Map<String, List<State>> regexStateCache = new HashMap<String, List<State>>();
    private static Map<String, Automaton> regexAutomatonCache = new HashMap<String, Automaton>();

    private static Automaton getRegexAutomaton(String regex) {
        if (!regexAutomatonCache.containsKey(regex)) {
            cacheRegex(regex);
        }
        return regexAutomatonCache.get(regex);
    }

    private static String getRegexInstance(String regex) {
        if (!regexAutomatonCache.containsKey(regex)) {
            cacheRegex(regex);
        }
        Automaton automaton = regexAutomatonCache.get(regex);
        return automaton.getShortestExample(true);
    }

    private static String getNonMatchingRegexInstance(String regex) {
        if (!regexAutomatonCache.containsKey(regex)) {
            cacheRegex(regex);
        }
        Automaton automaton = regexAutomatonCache.get(regex);
        return automaton.getShortestExample(false);
    }

    private static class GraphTransition {

        public enum TransitionType {
            INSERTION, DELETION, REPLACEMENT,
            /**
             * A phantom transition is an artificial transition from the sink/final states to a single artificial sink/state.
             * This is used to simplify the recursion calculation of the subpath costs.
             */
            PHANTOM
        }

        ;

        public final double cost;
        public final int fromRow;
        public final State fromState;
        public final TransitionType type;

        public GraphTransition(double cost, int fromRow, State fromState, TransitionType type) {
            this.cost = cost;
            this.fromRow = fromRow;
            this.fromState = fromState;
            this.type = type;
        }
    }

    /**
     * Normalize x in [0,1]
     *
     * @param x
     * @return
     */
    private static double normalize(double x) {
        return x / (x + 1.0);
    }

    /**
     * Java regular expressions contain predefined character classes which the
     * regex parser cannot handle
     *
     * @param regex
     * @return
     */
    private static String expandRegex(String regex) {
        // .	Any character (may or may not match line terminators)
        // \d	A digit: [0-9]
        String newRegex = regex.replaceAll("\\\\d", "[0-9]");

        // \D	A non-digit: [^0-9]
        newRegex = newRegex.replaceAll("\\\\D", "[^0-9]");

        // \s	A whitespace character: [ \t\n\x0B\f\r]
        newRegex = newRegex.replaceAll("\\\\s", "[ \\t\\n\\f\\r]");

        // \S	A non-whitespace character: [^\s]
        newRegex = newRegex.replaceAll("\\\\S", "[^ \\t\\n\\f\\r]");

        // \w	A word character: [a-zA-Z_0-9]
        newRegex = newRegex.replaceAll("\\\\w", "[a-zA-Z_0-9]");

        // \W	A non-word character: [^\w]
        newRegex = newRegex.replaceAll("\\\\W", "[^a-zA-Z_0-9]");

        if (newRegex.startsWith("^"))
            newRegex = newRegex.substring(1);

        if (newRegex.endsWith("$"))
            newRegex = newRegex.substring(0, newRegex.length() - 1);

        // TODO: Some of these should be handled, not just ignored!
        newRegex = removeFlagExpressions(newRegex);

        newRegex = removeReluctantOperators(newRegex);

        return newRegex;
    }

    private static String removeFlagExpressions(String regex) {
        // Case insensitive
        regex = regex.replaceAll("\\(\\?i\\)", "");

        // Unix lines mode
        regex = regex.replaceAll("\\(\\?d\\)", "");

        // Permit comments and whitespace in pattern
        regex = regex.replaceAll("\\(\\?x\\)", "");

        // Multiline mode
        regex = regex.replaceAll("\\(\\?m\\)", "");

        // Dotall
        regex = regex.replaceAll("\\(\\?s\\)", "");

        // Unicode case
        regex = regex.replaceAll("\\(\\?u\\)", "");

        return regex;
    }

    private static String removeReluctantOperators(String regex) {
        regex = regex.replaceAll("\\+\\?", "\\+");
        regex = regex.replaceAll("\\*\\?", "\\*");
        regex = regex.replaceAll("\\?\\?", "\\?");

        return regex;
    }

    /**
     * Ensure that each row has the full data structures containing the target state
     *
     * @param transitions
     * @param state
     * @param numRows
     */
    private static void ensureState(
            Map<Integer, Map<State, Set<GraphTransition>>> transitions, State state,
            int numRows) {
        for (int row = 0; row <= numRows; row++) {
            if (!transitions.containsKey(row))
                transitions.put(row, new HashMap<State, Set<GraphTransition>>());
            if (!transitions.get(row).containsKey(state))
                transitions.get(row).put(state, new HashSet<GraphTransition>());
        }
    }

    private static void cacheRegex(String regex) {
        String r = expandRegex(regex);
        Automaton automaton = new RegExp(r, RegExp.NONE).toAutomaton();
        automaton.expandSingleton();

        // We convert this to a graph without self-loops in order to determine the topological order
        DirectedGraph<State, DefaultEdge> regexGraph = new DefaultDirectedGraph<State, DefaultEdge>(
                DefaultEdge.class);
        Set<State> visitedStates = new HashSet<State>();
        Queue<State> states = new LinkedList<State>();
        State initialState = automaton.getInitialState();
        states.add(initialState);

        while (!states.isEmpty()) {
            State currentState = states.poll();
            if (visitedStates.contains(currentState))
                continue;
            if (!regexGraph.containsVertex(currentState))
                regexGraph.addVertex(currentState);
            for (Transition t : currentState.getTransitions()) {
                // Need to get rid of back edges, otherwise there is no topological order!
                if (!t.getDest().equals(currentState)) {
                    regexGraph.addVertex(t.getDest());
                    regexGraph.addEdge(currentState, t.getDest());
                    states.add(t.getDest());
                    CycleDetector<State, DefaultEdge> det = new CycleDetector<State, DefaultEdge>(
                            regexGraph);
                    if (det.detectCycles()) {
                        regexGraph.removeEdge(currentState, t.getDest());
                    }
                }
            }
            visitedStates.add(currentState);
        }

        TopologicalOrderIterator<State, DefaultEdge> iterator = new TopologicalOrderIterator<State, DefaultEdge>(
                regexGraph);
        List<State> topologicalOrder = new ArrayList<State>();
        while (iterator.hasNext()) {
            topologicalOrder.add(iterator.next());
        }

        regexStateCache.put(regex, topologicalOrder);
        regexAutomatonCache.put(regex, automaton);
    }

    /**
     * <p>
     * Get the distance between the arg and the given regex.
     * All operations (insertion/deletion/replacement) cost 1.
     * There is no assumption on where and how the operations
     * can be done (ie all sequences are valid).
     * </p>
     */
    public static int getStandardDistance(String arg, String regex) {
        if (!isSupportedRegex(regex)) {
            return getDefaultDistance(arg, regex);
        }
        RegexGraph graph;
        try {
            graph = new RegexGraph(arg, regex);
        }catch (Exception e){
            SimpleLogger.uniqueWarn("Failed to build graph for regex: " + regex);
            return getDefaultDistance(arg, regex);
        }
        CostMatrix matrix = new CostMatrix();

        try {
            return matrix.calculateStandardCost(graph);
        }catch (Exception e){
            SimpleLogger.uniqueWarn("Failed to compute distance cost for regex: " + regex);
            return getDefaultDistance(arg, regex);
        }
    }

    private static int getDefaultDistance(String arg, String regex) {
        Pattern p = Pattern.compile(regex);
        if (p.matcher(arg).matches())
            return 0;
        else
            return 1;

    }

    /**
     * Determine whether the regex requires features that are
     * not supported by the regex automaton library
     *
     * @param regex
     * @return
     */
    private static boolean isSupportedRegex(String regex) {
        if (regex.contains("\\b"))
            return false;

        return true;
    }


    private static Automaton getAndCacheAutomaton(String regex) {
        /*
         * Cache it if first time we build it
         */
        if (!regexAutomatonCache.containsKey(regex)) {
            /*
             * Create an automaton representing the regex
             */
            cacheRegex(regex);
        }

        Automaton automaton = regexAutomatonCache.get(regex);
        return automaton;
    }


    /**
     * A graph created based on an "arg" that is matched against a "regex".
     * There is going to be arg.length+1 copies of the regex automaton.
     * Each copy represents a "row".
     * Each automaton state, in topological order, represents a "column".
     * The graph can be considered as a "rows"x"columns" matrix.
     *
     * @author arcuri
     *
     */
    private static class RegexGraph {

        private Map<Integer, Map<State, Set<GraphTransition>>> transitions;
        private Map<Integer, State> intToStateMap;
        private Map<State, Integer> stateToIntMap;

        /**
         * Build the graph
         * @param arg
         * @param regex
         */
        public RegexGraph(String arg, String regex) {
            transitions = createGraph(arg, regex);
        }

        public int getNumberOfRows() {
            return transitions.keySet().size();
        }

        public int getNumberOfColumns() {
            return stateToIntMap.size();
        }

        /**
         * Get all the incoming transitions to the node located at coordinate "row" and "column"
         * @param row
         * @param column
         * @return
         */
        public Set<GraphTransition> getIncomingTransitions(int row, int column) {
            State state = intToStateMap.get(column);
            return transitions.get(row).get(state);
        }

        public int getColumn(State state) {
            return stateToIntMap.get(state);
        }

        private Map<Integer, Map<State, Set<GraphTransition>>> createGraph(String arg, String regex) {

            /*
             * Create a graph to calculate the distance. The algorithm is based on what discussed in:
             *
             * Mohammad Alshraideh and Leonardo Bottaci
             * Search-based software test data generation for string data using program-specific search operators
             * http://neo.lcc.uma.es/mase/attachments/085_TestDataGenerationForStringData.pdf
             *
             * and
             *
             * EUGENE W. MYERS and WEBB MILLER
             * APPROXIMATE MATCHING OF REGULAR EXPRESSIONS
             * http://www.cs.mun.ca/~harold/Courses/Old/Ling6800.W06/Diary/reg.aprox.pdf
             */

            Automaton automaton = getAndCacheAutomaton(regex);
            final int NUM_CHARS = arg.length();


            List<State> topologicalOrder = regexStateCache.get(regex);

            Map<Integer, Map<State, Set<GraphTransition>>> transitions = new HashMap<Integer, Map<State, Set<GraphTransition>>>();

            intToStateMap = new HashMap<Integer, State>();
            stateToIntMap = new HashMap<State, Integer>();
            int numState = 0;

            for (State currentState : topologicalOrder) {

                /*
                 * Init data structure to quickly map/access state/index
                 */
                stateToIntMap.put(currentState, numState);
                intToStateMap.put(numState, currentState);
                numState++;

                for (Transition t : currentState.getTransitions()) {

                    State destination = t.getDest();
                    ensureState(transitions, destination, NUM_CHARS);

                    for (int row = 0; row <= NUM_CHARS; row++) {
                        /*
                         *  add an insertion edge from currentState in row to target state in same row
                         */

                        transitions.get(row).get(destination).add(new GraphTransition(1.0, row, currentState, GraphTransition.TransitionType.INSERTION));
                    }

                    for (int row = 0; row < NUM_CHARS; row++) {
                        /*
                         *  Add a replacement edge from currentState in row to t.getDest in row+1
                         *  if charAt row+1 == the parameter of this transition, this is a zero-cost edge
                         */

                        double cost = 0.0;

                        if (arg.charAt(row) < t.getMin() || arg.charAt(row) > t.getMax()) {
                            int distMin = Math.abs(arg.charAt(row) - t.getMin());
                            int distMax = Math.abs(arg.charAt(row) - t.getMax());
                            cost = normalize(Math.min(distMin, distMax));
                        }

                        /*
                         * Important: even if the cost is 0 (eg match on the arg/regex in which we replace char X with X), we CANNOT
                         * use a PHANTOM transition. Even if we do not replace anything, we still need to consider it as a replacement
                         * transition. Consider the case
                         *
                         *  "ac".matches("abc")
                         *
                         *  If we used a phantom transition to represent the alignment c/c, then it would be possible to insert 'b' in the
                         *  middle of "abc". On the other hand, if we use a replacement c/c, then inserting 'b' would not be allowed, as an
                         *  insertion cannot be followed by a replacement.
                         */

                        transitions.get(row + 1).get(destination).add(new GraphTransition(cost, row, currentState, GraphTransition.TransitionType.REPLACEMENT));
                    }
                }

                ensureState(transitions, currentState, NUM_CHARS);

                for (int row = 0; row < NUM_CHARS; row++) {

                    /*
                     * add a deletion edge with cost 1 from currentState to currentState in next row
                     */

                    transitions.get(row + 1).get(currentState).add(new GraphTransition(1.0, row, currentState, GraphTransition.TransitionType.DELETION));
                }
            }

            // Add zero-cost transitions from accepting states to final state
            State finalState = new State();
            ensureState(transitions, finalState, NUM_CHARS);
            for (State s : automaton.getStates()) {
                if (s.isAccept()) {
                    transitions.get(NUM_CHARS).get(finalState).add(new GraphTransition(0, NUM_CHARS, s, GraphTransition.TransitionType.PHANTOM));
                }
            }
            intToStateMap.put(numState, finalState);
            stateToIntMap.put(finalState, numState);

            return transitions;
        }
    }

    /**
     * Class used to calculate the cost, ie the actual distance, based on a RegexGraph.
     *
     * @author arcuri
     */
    private static class CostMatrix {

        private final int DEL = 0;
        private final int REP = 1;
        private final int INS = 2;

        public CostMatrix() {
            super();
        }

        public int calculateStandardCost(RegexGraph graph) {
            final int ROWS = graph.getNumberOfRows();
            final int COLUMNS = graph.getNumberOfColumns();

            final double[][] matrix = new double[ROWS][COLUMNS];

            // First row is cost of matching empty sequence on regex
            final int FIRST_ROW = 0;

            /*
             * init first starting state with 0 costs
             */
            matrix[FIRST_ROW][0] = 0;

            //look at first row (which is special)
            for (int col = 1; col < graph.getNumberOfColumns(); col++) {

                double min = Double.MAX_VALUE;

                for (GraphTransition t : graph.getIncomingTransitions(FIRST_ROW, col)) {

                    int otherCol = graph.getColumn(t.fromState);

                    //self transition
                    if (col == otherCol) {
                        continue;
                    }

                    double otherCost = matrix[FIRST_ROW][otherCol];

                    min = Math.min(min, getSubPathCost(otherCost, Math.ceil(t.cost)));
                }

                matrix[FIRST_ROW][col] = min;
            }

            //then look at the other rows
            for (int i = 1; i < ROWS; i++) {

                for (int col = 0; col < COLUMNS; col++) {

                    matrix[i][col] = Double.MAX_VALUE;

                    for (GraphTransition t : graph.getIncomingTransitions(i, col)) {

                        int otherCol = graph.getColumn(t.fromState);
                        int otherRow = t.fromRow;

                        if (!t.type.equals(GraphTransition.TransitionType.PHANTOM)) {
                            matrix[i][col] = Math.min(matrix[i][col], getSubPathCost(matrix[otherRow][otherCol], Math.ceil(t.cost)));
                        } else {
                            /*
                             * artificial transition to final/sink state, so just take same values as previous state
                             */
                            matrix[i][col] = Math.min(matrix[i][col], matrix[otherRow][otherCol]);

                        }
                    }
                }
            }

            double min = matrix[ROWS - 1][COLUMNS - 1];
            return (int) Math.round(min);
        }

        /**
         * Note: this is different from normal matching algorithms, as we enforce an order
         * among the operators: delete, replace and then insert.
         * @param graph
         * @return
         */
        public double calculateCostForStringAVM(RegexGraph graph) {

            final int ROWS = graph.getNumberOfRows();
            final int COLUMNS = graph.getNumberOfColumns();

            /*
             * we create a matrix based on each row and each column in the graph.
             * Each cell has 3 values, each representing the cost of thre different types of path:
             *
             * 0) only deletion
             * 1) deletions followed by replacement
             * 2) as above, and then followed by insertions
             */
            final double[][][] matrix = new double[ROWS][COLUMNS][3];

            calculateInsertionCostOnFirstRow(graph, matrix);

            for (int i = 1; i < ROWS; i++) {

                for (int col = 0; col < COLUMNS; col++) {

                    /*
                     * unless a path is explicitly updated, it will have maximum distance by default
                     */
                    matrix[i][col][DEL] = Double.MAX_VALUE;
                    matrix[i][col][REP] = Double.MAX_VALUE;
                    matrix[i][col][INS] = Double.MAX_VALUE;

                    for (GraphTransition t : graph.getIncomingTransitions(i, col)) {

                        int otherCol = graph.getColumn(t.fromState);
                        int otherRow = t.fromRow;

                        if (t.type.equals(GraphTransition.TransitionType.INSERTION)) {
                            assert otherRow == i;
                            /*
                             * if we have an insertion, only the insertion path can be continued.
                             * that's the reason why on the left side we only update for [INS].
                             * An insertion can continue any type of path (and so all types are present on the right side).
                             */
                            matrix[i][col][INS] = Math.min(matrix[i][col][INS], getSubPathCost(matrix[otherRow][otherCol][DEL], t.cost));
                            matrix[i][col][INS] = Math.min(matrix[i][col][INS], getSubPathCost(matrix[otherRow][otherCol][REP], t.cost));
                            matrix[i][col][INS] = Math.min(matrix[i][col][INS], getSubPathCost(matrix[otherRow][otherCol][INS], t.cost));
                        } else if (t.type.equals(GraphTransition.TransitionType.REPLACEMENT)) {
                            /*
                             * if we have a replacement, then we cannot continue a delete path.
                             * So, no [DEL] on the left side.
                             * A replacement can continue a delete or replace path, but not an insertion one (and so [DEL] and
                             * [REP] on right side)
                             */
                            matrix[i][col][REP] = Math.min(matrix[i][col][REP], getSubPathCost(matrix[otherRow][otherCol][DEL], t.cost));
                            matrix[i][col][REP] = Math.min(matrix[i][col][REP], getSubPathCost(matrix[otherRow][otherCol][REP], t.cost));
                            /*
                             * from this state on, an insertion path can be followed, with same cost (ie right side) as replacement path
                             */
                            matrix[i][col][INS] = Math.min(matrix[i][col][INS], getSubPathCost(matrix[otherRow][otherCol][DEL], t.cost));
                            matrix[i][col][INS] = Math.min(matrix[i][col][INS], getSubPathCost(matrix[otherRow][otherCol][REP], t.cost));
                        } else if (t.type.equals(GraphTransition.TransitionType.DELETION)) {
                            /*
                             * deletion can only follow a deletion path (so only [DEL] or right side).
                             * but, from this state on, any new path can be followed (so all on left side)
                             */
                            matrix[i][col][DEL] = Math.min(matrix[i][col][DEL], getSubPathCost(matrix[otherRow][otherCol][DEL], t.cost));
                            matrix[i][col][REP] = Math.min(matrix[i][col][REP], getSubPathCost(matrix[otherRow][otherCol][DEL], t.cost));
                            matrix[i][col][INS] = Math.min(matrix[i][col][INS], getSubPathCost(matrix[otherRow][otherCol][DEL], t.cost));
                        } else if (t.type.equals(GraphTransition.TransitionType.PHANTOM)) {
                            assert t.cost == 0;
                            /*
                             * artificial transition to final/sink state, so just take same values as previous state
                             */
                            matrix[i][col][DEL] = Math.min(matrix[i][col][DEL], matrix[otherRow][otherCol][DEL]);
                            matrix[i][col][REP] = Math.min(matrix[i][col][REP], matrix[otherRow][otherCol][REP]);
                            matrix[i][col][INS] = Math.min(matrix[i][col][INS], matrix[otherRow][otherCol][INS]);
                        }
                    }
                }

                /*
                 * TODO: The algorithm of Myers's paper, at page 12, makes a distinction between D and E transitions.
                 * Insertions of type E are done last. Not fully clear if it has an effect here: ie, recall that
                 * here we do minimization (calculate distance) and not maximization (similarity)
                 */
            }

            /*
             * get the minimum among the 3 different paths in the sink state
             */
            double min = Double.MAX_VALUE;
            for (double value : matrix[ROWS - 1][COLUMNS - 1]) {
                if (value < min) {
                    min = value;
                }
            }

            return min;
        }

        /**
         * We cannot just do previousStateCost + transitionCost, as there might be computational overflows
         *
         * @param previousStateCost
         * @param transitionCost
         * @return
         * @throws IllegalArgumentException
         */
        private double getSubPathCost(double previousStateCost, double transitionCost) throws IllegalArgumentException {
            if (previousStateCost < 0) {
                throw new IllegalArgumentException("previousStateCost cannot be negative: " + previousStateCost);
            }
            if (transitionCost < 0) {
                throw new IllegalArgumentException("transitionCost cannot be negative: " + transitionCost);
            }

            if (previousStateCost == Double.MAX_VALUE || transitionCost == Double.MAX_VALUE) {
                return Double.MAX_VALUE;
            }

            double sum = previousStateCost + transitionCost;

            if (sum < previousStateCost || sum < transitionCost) {
                /*
                 * likely overflow
                 */
                return Double.MAX_VALUE;
            }

            return sum;
        }

        /**
         * First row is special, ie very different from the others
         *
         * @param graph
         * @param matrix
         */
        private void calculateInsertionCostOnFirstRow(RegexGraph graph, final double[][][] matrix) {

            // First row is cost of matching empty sequence on regex
            final int FIRST_ROW = 0;

            /*
             * init first starting state with 0 costs
             */
            matrix[FIRST_ROW][0][0] = 0;
            matrix[FIRST_ROW][0][1] = 0;
            matrix[FIRST_ROW][0][2] = 0;

            for (int col = 1; col < graph.getNumberOfColumns(); col++) {

                double min = Double.MAX_VALUE;

                for (GraphTransition t : graph.getIncomingTransitions(FIRST_ROW, col)) {

                    /*
                     * on first row, there can be only insertions coming from the same row,
                     * apart from last node that can have a phantom transition to sink state
                     */
                    assert t.type.equals(GraphTransition.TransitionType.INSERTION) ||
                            t.type.equals(GraphTransition.TransitionType.PHANTOM);
                    assert t.fromRow == 0;

                    int otherCol = graph.getColumn(t.fromState);

                    //self transition
                    if (col == otherCol) {
                        continue;
                    }

                    double otherCost = matrix[FIRST_ROW][otherCol][2];

                    min = Math.min(min, getSubPathCost(otherCost, t.cost));
                }

                /*
                 * as there can be only insertions, the delete and replace paths cannot be followed, and
                 * so maximum distance
                 */
                matrix[FIRST_ROW][col][0] = Double.MAX_VALUE;
                matrix[FIRST_ROW][col][1] = Double.MAX_VALUE;
                matrix[FIRST_ROW][col][2] = min;
            }
        }
    }
}

